In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
In [6]:
# Load the raw customer data.
# NOTE(review): a user-specific absolute path breaks on every other machine —
# keep it in one named constant so it is easy to repoint (ideally a relative
# path under a project data directory).
CSV_PATH = r"C:\Users\syed sahel\Downloads\customers_sample.csv"

df = pd.read_csv(CSV_PATH)
df.head()
Out[6]:
CustomerID Age Gender Annual_Income Spending_Score Favorite_Genre
0 1 54 Female 30260 81 Romance
1 2 67 Male 51855 65 Comedy
2 3 44 Male 56393 97 Sci-Fi
3 4 30 Male 82355 100 Drama
4 5 58 Male 12688 70 Action

3️⃣ Data Cleaning & Preprocessing

In [7]:
# Count missing values per column (the dataset turns out to be complete).
df.isna().sum()
Out[7]:
CustomerID        0
Age               0
Gender            0
Annual_Income     0
Spending_Score    0
Favorite_Genre    0
dtype: int64
In [8]:
# Drop any rows containing missing values — a no-op on this data (the null
# check showed zero missing), but it keeps the pipeline safe on fresh data.
df = df.dropna(axis=0, how="any")
In [9]:
# Remove exact duplicate rows, keeping the first occurrence (pandas default).
df = df.drop_duplicates(keep="first")
In [10]:
# One-hot encode the categorical columns (Gender, Favorite_Genre).
# drop_first=True drops one level per category to avoid collinearity.
df = pd.get_dummies(data=df, drop_first=True)

4️⃣ Exploratory Data Analysis (EDA)

In [11]:
# Schema overview (dtypes, non-null counts, memory usage), then summary
# statistics; the stats frame is the cell's last expression so it renders
# as a rich table.
df.info()
summary_stats = df.describe()
summary_stats
<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   CustomerID                  500 non-null    int64
 1   Age                         500 non-null    int64
 2   Annual_Income               500 non-null    int64
 3   Spending_Score              500 non-null    int64
 4   Gender_Male                 500 non-null    uint8
 5   Gender_Other                500 non-null    uint8
 6   Favorite_Genre_Comedy       500 non-null    uint8
 7   Favorite_Genre_Documentary  500 non-null    uint8
 8   Favorite_Genre_Drama        500 non-null    uint8
 9   Favorite_Genre_Horror       500 non-null    uint8
 10  Favorite_Genre_Romance      500 non-null    uint8
 11  Favorite_Genre_Sci-Fi       500 non-null    uint8
dtypes: int64(4), uint8(8)
memory usage: 23.4 KB
Out[11]:
CustomerID Age Annual_Income Spending_Score Gender_Male Gender_Other Favorite_Genre_Comedy Favorite_Genre_Documentary Favorite_Genre_Drama Favorite_Genre_Horror Favorite_Genre_Romance Favorite_Genre_Sci-Fi
count 500.000000 500.000000 500.000000 500.000000 500.0000 500.000000 500.000000 500.000000 500.000000 500.000000 500.000000 500.000000
mean 250.500000 43.312000 57371.108000 90.838000 0.4800 0.026000 0.206000 0.066000 0.182000 0.116000 0.106000 0.144000
std 144.481833 15.577161 24048.352901 12.699382 0.5001 0.159295 0.404836 0.248531 0.386231 0.320546 0.308146 0.351441
min 1.000000 16.000000 8000.000000 37.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 125.750000 30.750000 41230.000000 84.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 250.500000 44.000000 56877.000000 98.000000 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 375.250000 56.000000 72737.250000 100.000000 1.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 500.000000 69.000000 120810.000000 100.000000 1.0000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [13]:
# Pairwise scatter plots / histograms for every numeric column.
# NOTE: with 12 columns this grid is large and slow to render.
pair_grid = sns.pairplot(df)
plt.show()
In [14]:
# Correlation heatmap across all numeric / dummy columns, drawn on an
# explicitly created axes rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
plt.show()

✔ Histograms

In [34]:
# Per-column histograms to eyeball each distribution at a glance.
hist_axes = df.hist(figsize=(12, 10))
plt.show()

5️⃣ Feature Engineering

In [23]:
# Cluster on the three numeric behavioural attributes only; the 0/1 dummy
# columns are excluded so Euclidean distance is not dominated by indicators.
features = ["Age", "Annual_Income", "Spending_Score"]

X = df.loc[:, features]

✔ Standardize the data

In [24]:
# Standardize the features: K-Means uses Euclidean distance, so each feature
# must be on a comparable scale (zero mean, unit variance) — otherwise
# Annual_Income (tens of thousands) would swamp Age and Spending_Score.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

6️⃣ Apply Clustering (K-Means)

In [25]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster
# sum of squares (inertia). The "elbow" in the curve suggests a good k.
#
# Fixes over the original:
#   * random_state pinned -> reproducible WCSS values across re-runs
#     (k-means++ initialisation is otherwise stochastic).
#   * n_init given explicitly -> silences the sklearn FutureWarning about
#     its default changing (10 -> 'auto' in scikit-learn 1.4).
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss, marker="o")
plt.title("Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1036: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
In [26]:
# Fix: the original line `OMP_NUM_THREADS=2` only binds a *Python variable*
# named OMP_NUM_THREADS — it does NOT set the environment variable, so the
# MKL memory-leak warning from KMeans was unaffected. Set the real env var
# instead. (It is best set before numpy/sklearn are first imported, e.g. at
# the very top of the notebook or in the shell that launches Jupyter.)
import os

os.environ["OMP_NUM_THREADS"] = "2"

📌 Train Final K-Means Model (k=4 Example)

In [27]:
# Final model: k=4 chosen from the elbow plot above; seed pinned for
# reproducible cluster assignments.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_scaled)

# fit(X).labels_ is equivalent to fit_predict(X) for K-Means.
df["Cluster"] = kmeans.labels_
df.head()
Out[27]:
CustomerID Age Annual_Income Spending_Score Gender_Male Gender_Other Favorite_Genre_Comedy Favorite_Genre_Documentary Favorite_Genre_Drama Favorite_Genre_Horror Favorite_Genre_Romance Favorite_Genre_Sci-Fi Cluster
0 1 54 30260 81 0 0 0 0 0 0 1 0 0
1 2 67 51855 65 1 0 1 0 0 0 0 0 0
2 3 44 56393 97 1 0 0 0 0 0 0 1 3
3 4 30 82355 100 1 0 0 0 1 0 0 0 1
4 5 58 12688 70 1 0 0 0 0 0 0 0 0

7️⃣ Visualize Clusters

✔ 2D Scatter Plot

In [28]:
# Income vs. spending, coloured by cluster assignment — the classic
# two-feature view of the segmentation.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x="Annual_Income",
    y="Spending_Score",
    hue="Cluster",
    palette="bright",
    s=100,
)
plt.title("Customer Segmentation")
plt.show()

✔ 3D Visualization

In [29]:
# 3-D view of the three clustering features, coloured by cluster label.
# The Axes3D import is unused directly; on older matplotlib (<3.2) merely
# importing it registers the '3d' projection, so it is kept deliberately.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    df["Age"],
    df["Annual_Income"],
    df["Spending_Score"],
    c=df["Cluster"],
    s=50,
)
ax.set_xlabel("Age")
ax.set_ylabel("Annual Income")
ax.set_zlabel("Spending Score")
plt.title("3D Customer Segmentation")
plt.show()

8️⃣ PCA Visualization (Dimensionality Reduction)

In [30]:
# Project the scaled features onto their first two principal components —
# purely for 2-D visualization; clustering itself used the full feature set.
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)

df["PC1"] = components[:, 0]
df["PC2"] = components[:, 1]

sns.scatterplot(data=df, x="PC1", y="PC2", hue="Cluster", palette="tab10", s=100)
plt.title("PCA Cluster Visualization")
plt.show()

9️⃣ Cluster Summary (Insights)

In [31]:
# Mean profile of each cluster on the original (unscaled) features —
# the basis for naming and interpreting the segments.
profile_cols = ["Age", "Annual_Income", "Spending_Score"]
cluster_summary = df.groupby("Cluster")[profile_cols].agg("mean")
cluster_summary
Out[31]:
Age Annual_Income Spending_Score
Cluster
0 58.085714 39166.657143 70.485714
1 30.379845 79309.782946 99.705426
2 30.500000 37240.032258 96.233871
3 55.323944 68481.190141 93.119718

🔟 Dashboard (Plotly)

In [32]:
import plotly.express as px

# Interactive scatter: income vs. spending, bubble size = age.
# NOTE(review): "Cluster" is an integer column, so plotly renders it on a
# continuous colour scale; cast to str for discrete colours if preferred —
# TODO confirm which rendering is intended.
scatter_kwargs = dict(
    x="Annual_Income",
    y="Spending_Score",
    color="Cluster",
    size="Age",
    hover_data=["Age"],
    title="Customer Segmentation Dashboard",
)
fig = px.scatter(df, **scatter_kwargs)
fig.show()
In [ ]: